Convolutional AutoEncoder model for anomalies detection

Strategy : Device

In [1]:
# ---- utils libs ----
import numpy as np
import pandas as pd
import datetime
from typing import Optional

# --- Import functions from utils.py ---
import sys
sys.path.insert(0,'../src')

from utils import plot_confusion_matrix, plot_activity_hist, load_dataset, load_aggregate_dataset, time_in_range, segmentDf, create_sequence, train_test_split_dataset, convertToSequenceParameters

# ---- Data Viz libs ---- 
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import seaborn as sns

# ---- ML libs ----
from sklearn.preprocessing import StandardScaler

# ---- Deep Learning libs ----
from tensorflow import keras
from tensorflow.keras import layers

Load Dataset

In [2]:
dataset_resampled_1H = load_dataset("house1_power_blk2_labels.zip","60min")
df_activity_resampled_1H = dataset_resampled_1H["activity"]
dataset_resampled_1H = dataset_resampled_1H["mains"]
df_mains_resampled_1H = pd.DataFrame(dataset_resampled_1H)
df_mains_resampled_1H
Out[2]:
mains
datetime
2016-03-06 00:00:00 473.0
2016-03-06 01:00:00 736.0
2016-03-06 02:00:00 479.0
2016-03-06 03:00:00 467.0
2016-03-06 04:00:00 610.0
... ...
2016-05-07 18:00:00 1413.0
2016-05-07 19:00:00 1608.0
2016-05-07 20:00:00 1116.0
2016-05-07 21:00:00 2330.0
2016-05-07 22:00:00 594.0

1511 rows × 1 columns

Visualize Load Curve

Résumé du dataset en prenant un point toute les heures

In [3]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_mains_resampled_1H.index, y=df_mains_resampled_1H['mains'], name='Load Curve'))
fig.update_layout(showlegend=True, title='house1_power_blk2_labels_60min')
fig.show()

Remarque ci-dessus : l'activité électrique diminue globalement pour le fichier "house1_power_blk2_labels", cela peut poser problème pour la séparation train/test

Preprocessing

1/ Loads the dataset and resample timeseries

2/ Split a dataframe into train set and test set according to the split rate

3/ Standardize Data

4/ Construction of the dataset according to peak and off-peak hours or according to activity labels

5/ Creation of sequences of length T and according to the overlapping period

Return preprocessed 3D-array [samples, SEQUENCE_LENGTH, features] (i.e sequences from the timeseries) , as required for LSTM network. We want our network to have memory of 10 days, so we set SEQUENCE_LENGTH=10.

Define Global Hyperparmètre

In [4]:
TIME_STEP = datetime.timedelta(minutes=1, seconds=30) # duration of a step in the resample dataset, originally 1 second
DURATION_TIME = datetime.timedelta(minutes=60) # duration of a sequence
OVERLAP_PERIOD_PERCENT = 0.8 # 0.5 <=> 50% overlapping
TIMEFRAMES = [(datetime.time(0,0,0), datetime.time(3,0,0))] # timeframes we consider as unactive
STRATEGY = "off_peak_time" # device, off_peak_time, label 

SEQUENCE_LENGTH, OVERLAP_PERIOD = convertToSequenceParameters(TIME_STEP, DURATION_TIME, OVERLAP_PERIOD_PERCENT)

print("\t\tValeur choisie \t Equivalent sequence\nTimestep : \t {}\nDuration :\t {} \t -->  {} \nOverlap :\t {} \t\t -->  {}".format(TIME_STEP, DURATION_TIME, SEQUENCE_LENGTH, OVERLAP_PERIOD_PERCENT, OVERLAP_PERIOD))
		Valeur choisie 	 Equivalent sequence
Timestep : 	 0:01:30
Duration :	 1:00:00 	 -->  40 
Overlap :	 0.8 		 -->  32
In [5]:
df_mains_resampled_1H.loc[:, ['mains']]
Out[5]:
mains
datetime
2016-03-06 00:00:00 473.0
2016-03-06 01:00:00 736.0
2016-03-06 02:00:00 479.0
2016-03-06 03:00:00 467.0
2016-03-06 04:00:00 610.0
... ...
2016-05-07 18:00:00 1413.0
2016-05-07 19:00:00 1608.0
2016-05-07 20:00:00 1116.0
2016-05-07 21:00:00 2330.0
2016-05-07 22:00:00 594.0

1511 rows × 1 columns

In [6]:
def preprocessing(timeframes: list
                  ,sequence_length: int, overlap_period: int
                  ,resample_period :Optional[str]=None
                  ,use_labels :Optional[bool]=False
                  ,strategy :Optional[str] = "off_peak_time" 
                  ,split_rate :Optional[float]=0.2) -> np.array:
    """
    1/ Loads the dataset and resample timeseries
    2/ Split a dataframe into train set and test set according to the split rate
    3/ Standardize Data
    4/ Construction of the dataset according to peak and off-peak hours 
    or according to activity labels
    5/ Creation of sequences of length T and according to the overlapping period
    
    Args:
        - resample_period: (optional) the reasmple period, if None the default period of 1 second will be used
        - timeframes: list of tuples indicating the periods of the day ex: timeframes = [(datetime.time(10,0,0), datetime.time(6,0,0)), (datetime.time(12,0,0), datetime.time(13,0,0))
        - use_labels: (False by default) use the activities labels
        - sequence_length: length of the sequence
        - overlap_period: overlap the sequences of timeseries
        - device_approach: the aggregated load curve of the devices which, when in operation, do not allow us to predict an activity 
        - split_rate: Rate of the test set size
        - device_strategy: use inactive devices base load curve
    Returns: 
        - list of prepocessed 3D-array [samples, sequence_length, features] (i.e sequences from the timeseries) 
    """
        
    # load dataset with labels and resampled timeseries
    df_resampled = load_dataset("house1_power_blk2_labels.zip", resample_period)
    
    # split dataframe into train set and test set
    train_df, test_df = train_test_split_dataset(df_resampled)
    
    # Standardize Data
    scaler = StandardScaler()
    scaler_train = scaler.fit(train_df.loc[:, ['mains']])
    
    train_df.loc[:, 'mains'] = scaler_train.transform(train_df.loc[:, ['mains']])
    test_df.loc[:, 'mains'] = scaler_train.transform(test_df.loc[:, ['mains']])
        
    # ---- TEST SEQUENCES ----
    X_sequences_test, y_sequences_test = create_sequence(test_df, sequence_length, overlap_period)
    print('Duplicates in test_df : ', test_df.duplicated().any())
    
    if STRATEGY == "device":
        # load dataset with labels and resampled timeseries
        df_resampled_with_labels = load_dataset("house1_power_blk2_labels.zip", resample_period)
        # load dataset with inactive devices
        df_resampled_devices_inactive = load_aggregate_dataset("house1_power_blk2.zip", "inactive_house2", resample_period)
        activity = df_resampled_with_labels["activity"]
        df_resampled_device = df_resampled_devices_inactive.join(activity)
        df_resampled_device['mains'] = scaler_train.transform(df_resampled_device[['mains']])
        
        # --- TRAIN SEQUENCES ----
        X_sequence_train_device, y_sequence_train_device = create_sequence(df_resampled_device, sequence_length, overlap_period)
        
        return df_resampled_device, test_df, X_sequence_train_device, y_sequence_train_device, X_sequences_test, y_sequences_test
    
    
    if STRATEGY == "label":
    # load dataset with labels and resampled timeseries
        df_resampled_with_labels = load_dataset("house1_power_blk2_labels.zip", resample_period)
        df_resampled_with_labels = df_resampled_with_labels[df_resampled_with_labels.activity == 0]
        df_resampled_with_labels['mains'] = scaler_train.transform(df_resampled_with_labels[['mains']])
        
        # --- TRAIN SEQUENCES ----
        X_sequence_train_label, y_sequence_train_label = create_sequence(df_resampled_with_labels, sequence_length, overlap_period)
        
        return df_resampled_with_labels, test_df, X_sequence_train_label, y_sequence_train_label, X_sequences_test, y_sequences_test
    
    
    if STRATEGY == "off_peak_time":
        # --- TRAIN SEQUENCES ----
        # Construction of the dataset according to peak and off-peak hours 
        list_df_train = segmentDf(train_df, timeframes = timeframes)

        # init 3D-array [samples, sequence_length, features]
        first_df_train = list_df_train[0]
        list_X_sequence_train, list_y_sequence_train = create_sequence(first_df_train, sequence_length, overlap_period)
        list_df_train.pop(0) # delete the first element of the list of train dataframes

        # Creation of sequences of length T and according to the overlapping period
        for df_train_ in list_df_train:
            X_sequences_train, y_sequences_train = create_sequence(df_train_, sequence_length, overlap_period)
            list_X_sequence_train = np.append(list_X_sequence_train, X_sequences_train, axis = 0)
            list_y_sequence_train = np.append(list_y_sequence_train, y_sequences_train, axis = 0)
        
        return list_df_train, train_df, test_df, list_X_sequence_train, list_y_sequence_train, X_sequences_test, y_sequences_test
In [7]:
if STRATEGY == "device":
    resampled_device_train_df, test_df, X_train, y_train, X_test, y_test = preprocessing( 
                            timeframes = TIMEFRAMES
                            ,sequence_length = SEQUENCE_LENGTH
                            ,overlap_period = OVERLAP_PERIOD
                            ,resample_period = TIME_STEP
                            ,strategy = STRATEGY)
    
if STRATEGY == "label":
    resampled_label_train_df, test_df, X_train, y_train, X_test, y_test = preprocessing( 
                            timeframes = TIMEFRAMES
                            ,sequence_length = SEQUENCE_LENGTH
                            ,overlap_period = OVERLAP_PERIOD
                            ,resample_period = TIME_STEP
                            ,strategy = STRATEGY)
    
if STRATEGY == "off_peak_time":
    print("Strategy chosen : off_peak_time")
    list_df_train, train_df, test_df, X_train, y_train, X_test, y_test = preprocessing( 
                              timeframes = TIMEFRAMES
                              ,sequence_length = SEQUENCE_LENGTH
                              ,overlap_period = OVERLAP_PERIOD
                              ,resample_period = TIME_STEP
                              ,strategy = STRATEGY)
Strategy chosen : off_peak_time
C:\Users\asus\anaconda3\lib\site-packages\pandas\core\indexing.py:1835: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

C:\Users\asus\anaconda3\lib\site-packages\pandas\core\indexing.py:1835: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

../src\utils.py:222: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Duplicates in test_df :  True
In [8]:
if STRATEGY == "device":
    print("---- train DataFrame Device shape ----")
    print(resampled_device_train_df.shape)
    
    
if STRATEGY == "label":
    print("---- train DataFrame Label shape ----")
    print(resampled_label_train_df.shape)
    
    
if STRATEGY == "off_peak_time":
    print("---- train DataFrame Off-Peak-Time shape ----")
    print(train_df.shape)
---- train DataFrame Off-Peak-Time shape ----
(48352, 3)
In [9]:
print("---- test DataFrame shape ----")
test_df.shape
---- test DataFrame shape ----
Out[9]:
(12088, 3)

Visualize train_df

In [10]:
if STRATEGY == "device":
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=resampled_device_train_df.index, y=resampled_device_train_df['mains'], name='Load Curve'))
    fig.update_layout(showlegend=True, title='resampled_device_train_df')
    fig.show()
    
if STRATEGY == "label":
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=resampled_label_train_df.index, y=resampled_label_train_df['mains'], name='Load Curve'))
    fig.update_layout(showlegend=True, title='resampled_label_train_df')
    fig.show()
    
if STRATEGY == "off_peak_time":
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=train_df.index, y=train_df['mains'], name='Load Curve'))
    fig.update_layout(showlegend=True, title='train_df')
    fig.show()

Visualize test_df

In [11]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=test_df.index, y=test_df['mains'], name='Load Curve'))
fig.update_layout(showlegend=True, title='test_df')
fig.show()

Visualize Train Test split of DataFrame

In [12]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_df.index, y=train_df['mains'], name='Train'))
fig.add_trace(go.Scatter(x=test_df.index, y=test_df['mains'], name='Test'))
fig.update_layout(showlegend=True, title='train test split of Dataframe',xaxis_title="Date",
    yaxis_title="mains (power, normalized)",
    legend_title="Dataframe")
fig.show()
print("train_df[mains].mean()", train_df["mains"].mean())
print("test_df[mains].mean()", test_df["mains"].mean())
train_df[mains].mean() -1.1121425862742852e-15
test_df[mains].mean() -0.3282213667296779

Visualize list_df_train

In [13]:
if STRATEGY == "off_peak_time":
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=list_df_train[6].index, y=list_df_train[6]['mains'], name='Load Curve'))
    fig.update_layout(showlegend=True, title='list_df_train'+str(2))
    fig.show()
    
print("TIME_STEP value : ", TIME_STEP)
TIME_STEP value :  0:01:30

Verify the shape of the sequence

In [14]:
print("---- X_train sequence shape ----")
print(X_train.shape)

print("\n---- y_train sequence shape ----")
print(y_train.shape)

print("\n\n---- X_test sequence shape ----")
print(X_test.shape)

print("\n---- y_test sequence shape ----")
print(y_test.shape)
---- X_train sequence shape ----
(561, 40, 1)

---- y_train sequence shape ----
(561, 40, 3)


---- X_test sequence shape ----
(1506, 40, 1)

---- y_test sequence shape ----
(1506, 40, 3)

Visualize the sequence

In [15]:
print("---- X_train sequence ----")
X_train
---- X_train sequence ----
Out[15]:
array([[[-0.71576861],
        [-0.70545704],
        [-0.71473745],
        ...,
        [-0.82610242],
        [-0.73329828],
        [-0.41157727]],

       [[-0.72298671],
        [-0.55800157],
        [-0.57759356],
        ...,
        [-0.84363209],
        [-0.8384763 ],
        [-0.8240401 ]],

       [[-0.70442588],
        [-0.70751935],
        [-0.70751935],
        ...,
        [-0.44560545],
        [-0.31877313],
        [-0.45591702]],

       ...,

       [[-0.73432944],
        [-0.55387695],
        [-0.56625083],
        ...,
        [-0.74257869],
        [-0.70751935],
        [-0.72298671]],

       [[-0.57140662],
        [-0.61780868],
        [-0.74464101],
        ...,
        [-0.66111728],
        [-0.61883984],
        [-0.75495258]],

       [[-0.72504902],
        [-0.54665885],
        [-0.54975232],
        ...,
        [-0.59512323],
        [-0.55593926],
        [-0.42910694]]])
In [16]:
print("---- y_train sequence ----")
y_train
---- y_train sequence ----
Out[16]:
array([[[Timestamp('2016-03-06 00:00:00'), 0, 0],
        [Timestamp('2016-03-06 00:01:30'), 0, 1],
        [Timestamp('2016-03-06 00:03:00'), 0, 2],
        ...,
        [Timestamp('2016-03-06 00:55:30'), 0, 37],
        [Timestamp('2016-03-06 00:57:00'), 0, 38],
        [Timestamp('2016-03-06 00:58:30'), 0, 39]],

       [[Timestamp('2016-03-06 00:12:00'), 0, 8],
        [Timestamp('2016-03-06 00:13:30'), 0, 9],
        [Timestamp('2016-03-06 00:15:00'), 0, 10],
        ...,
        [Timestamp('2016-03-06 01:07:30'), 0, 45],
        [Timestamp('2016-03-06 01:09:00'), 0, 46],
        [Timestamp('2016-03-06 01:10:30'), 0, 47]],

       [[Timestamp('2016-03-06 00:24:00'), 0, 16],
        [Timestamp('2016-03-06 00:25:30'), 0, 17],
        [Timestamp('2016-03-06 00:27:00'), 0, 18],
        ...,
        [Timestamp('2016-03-06 01:19:30'), 0, 53],
        [Timestamp('2016-03-06 01:21:00'), 0, 54],
        [Timestamp('2016-03-06 01:22:30'), 0, 55]],

       ...,

       [[Timestamp('2016-04-25 01:36:00'), 0, 64],
        [Timestamp('2016-04-25 01:37:30'), 0, 65],
        [Timestamp('2016-04-25 01:39:00'), 0, 66],
        ...,
        [Timestamp('2016-04-25 02:31:30'), 0, 101],
        [Timestamp('2016-04-25 02:33:00'), 0, 102],
        [Timestamp('2016-04-25 02:34:30'), 0, 103]],

       [[Timestamp('2016-04-25 01:48:00'), 0, 72],
        [Timestamp('2016-04-25 01:49:30'), 0, 73],
        [Timestamp('2016-04-25 01:51:00'), 0, 74],
        ...,
        [Timestamp('2016-04-25 02:43:30'), 0, 109],
        [Timestamp('2016-04-25 02:45:00'), 0, 110],
        [Timestamp('2016-04-25 02:46:30'), 0, 111]],

       [[Timestamp('2016-04-25 02:00:00'), 0, 80],
        [Timestamp('2016-04-25 02:01:30'), 0, 81],
        [Timestamp('2016-04-25 02:03:00'), 0, 82],
        ...,
        [Timestamp('2016-04-25 02:55:30'), 0, 117],
        [Timestamp('2016-04-25 02:57:00'), 0, 118],
        [Timestamp('2016-04-25 02:58:30'), 0, 119]]], dtype=object)
In [17]:
print("---- X_test sequence ----")
X_test
---- X_test sequence ----
Out[17]:
array([[[-0.44869892],
        [-0.66111728],
        [-0.6631796 ],
        ...,
        [-0.54459653],
        [-0.44663661],
        [-0.3074304 ]],

       [[-0.58687397],
        [-0.42704462],
        [-0.58481166],
        ...,
        [-0.5765624 ],
        [-0.50541256],
        [-0.51572413]],

       [[-0.32083545],
        [-0.45694818],
        [-0.34867669],
        ...,
        [-0.44869892],
        [-0.3435209 ],
        [-0.44663661]],

       ...,

       [[-0.12388444],
        [-0.14038296],
        [-0.03520493],
        ...,
        [-0.57759356],
        [-0.68277158],
        [-0.68689621]],

       [[-0.55903273],
        [-0.57965587],
        [-0.63224488],
        ...,
        [-0.42704462],
        [-0.56109505],
        [-0.57450009]],

       [[-0.45282355],
        [-0.45488587],
        [-0.57965587],
        ...,
        [-0.70855051],
        [-0.70442588],
        [-0.5549081 ]]])
In [18]:
print("---- y_test sequence ----")
y_test
---- y_test sequence ----
Out[18]:
array([[[Timestamp('2016-04-25 08:48:00'), 0, 0],
        [Timestamp('2016-04-25 08:49:30'), 0, 1],
        [Timestamp('2016-04-25 08:51:00'), 0, 2],
        ...,
        [Timestamp('2016-04-25 09:43:30'), 0, 37],
        [Timestamp('2016-04-25 09:45:00'), 0, 38],
        [Timestamp('2016-04-25 09:46:30'), 0, 39]],

       [[Timestamp('2016-04-25 09:00:00'), 0, 8],
        [Timestamp('2016-04-25 09:01:30'), 0, 9],
        [Timestamp('2016-04-25 09:03:00'), 0, 10],
        ...,
        [Timestamp('2016-04-25 09:55:30'), 0, 45],
        [Timestamp('2016-04-25 09:57:00'), 0, 46],
        [Timestamp('2016-04-25 09:58:30'), 0, 47]],

       [[Timestamp('2016-04-25 09:12:00'), 0, 16],
        [Timestamp('2016-04-25 09:13:30'), 0, 17],
        [Timestamp('2016-04-25 09:15:00'), 0, 18],
        ...,
        [Timestamp('2016-04-25 10:07:30'), 0, 53],
        [Timestamp('2016-04-25 10:09:00'), 0, 54],
        [Timestamp('2016-04-25 10:10:30'), 0, 55]],

       ...,

       [[Timestamp('2016-05-07 21:24:00'), 1, 12024],
        [Timestamp('2016-05-07 21:25:30'), 1, 12025],
        [Timestamp('2016-05-07 21:27:00'), 1, 12026],
        ...,
        [Timestamp('2016-05-07 22:19:30'), 1, 12061],
        [Timestamp('2016-05-07 22:21:00'), 1, 12062],
        [Timestamp('2016-05-07 22:22:30'), 1, 12063]],

       [[Timestamp('2016-05-07 21:36:00'), 1, 12032],
        [Timestamp('2016-05-07 21:37:30'), 1, 12033],
        [Timestamp('2016-05-07 21:39:00'), 1, 12034],
        ...,
        [Timestamp('2016-05-07 22:31:30'), 0, 12069],
        [Timestamp('2016-05-07 22:33:00'), 0, 12070],
        [Timestamp('2016-05-07 22:34:30'), 0, 12071]],

       [[Timestamp('2016-05-07 21:48:00'), 1, 12040],
        [Timestamp('2016-05-07 21:49:30'), 1, 12041],
        [Timestamp('2016-05-07 21:51:00'), 1, 12042],
        ...,
        [Timestamp('2016-05-07 22:43:30'), 0, 12077],
        [Timestamp('2016-05-07 22:45:00'), 0, 12078],
        [Timestamp('2016-05-07 22:46:30'), 0, 12079]]], dtype=object)

Build a model

We will build a convolutional reconstruction autoencoder model. The model will take input of shape (batch_size, sequence_length, num_features) and return output of the same shape. In this case, sequence_length is 10 and num_features is 1.

In [19]:
X_train.shape # 3d Array (samples, SEQUENCE_LENGTH, num_features)
Out[19]:
(561, 40, 1)
In [20]:
X_train.shape[1]
Out[20]:
40
In [21]:
X_train.shape[2]
Out[21]:
1
In [22]:
model = keras.Sequential(
    [
        layers.Input(shape=(X_train.shape[1], X_train.shape[2])),
        layers.Conv1D(
            filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Dropout(rate=0.2),
        layers.Conv1D(
            filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Conv1DTranspose(
            filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Dropout(rate=0.2),
        layers.Conv1DTranspose(
            filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
        ),
        layers.Conv1DTranspose(filters=1, kernel_size=4, padding="same"),
    ]
)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="mse")
model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 conv1d (Conv1D)             (None, 20, 32)            256       
                                                                 
 dropout (Dropout)           (None, 20, 32)            0         
                                                                 
 conv1d_1 (Conv1D)           (None, 10, 16)            3600      
                                                                 
 conv1d_transpose (Conv1DTra  (None, 20, 16)           1808      
 nspose)                                                         
                                                                 
 dropout_1 (Dropout)         (None, 20, 16)            0         
                                                                 
 conv1d_transpose_1 (Conv1DT  (None, 40, 32)           3616      
 ranspose)                                                       
                                                                 
 conv1d_transpose_2 (Conv1DT  (None, 40, 1)            129       
 ranspose)                                                       
                                                                 
=================================================================
Total params: 9,409
Trainable params: 9,409
Non-trainable params: 0
_________________________________________________________________

Train the model

Please note that we are using X_train as both the input and the target since this is a reconstruction model.

In [23]:
history = model.fit(
    X_train,
    X_train,
    epochs=50,
    batch_size=128,
    validation_split=0.1,
    callbacks=[
        keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, mode="min")
    ],
)
Epoch 1/50
4/4 [==============================] - 1s 199ms/step - loss: 0.3762 - val_loss: 0.3038
Epoch 2/50
4/4 [==============================] - 0s 109ms/step - loss: 0.3060 - val_loss: 0.2238
Epoch 3/50
4/4 [==============================] - 0s 23ms/step - loss: 0.2096 - val_loss: 0.1088
Epoch 4/50
4/4 [==============================] - 0s 23ms/step - loss: 0.0983 - val_loss: 0.0325
Epoch 5/50
4/4 [==============================] - 0s 21ms/step - loss: 0.0898 - val_loss: 0.0509
Epoch 6/50
4/4 [==============================] - 0s 24ms/step - loss: 0.0774 - val_loss: 0.0274
Epoch 7/50
4/4 [==============================] - 0s 22ms/step - loss: 0.0602 - val_loss: 0.0379
Epoch 8/50
4/4 [==============================] - 0s 23ms/step - loss: 0.0652 - val_loss: 0.0369
Epoch 9/50
4/4 [==============================] - 0s 22ms/step - loss: 0.0603 - val_loss: 0.0258
Epoch 10/50
4/4 [==============================] - 0s 21ms/step - loss: 0.0532 - val_loss: 0.0224
Epoch 11/50
4/4 [==============================] - 0s 21ms/step - loss: 0.0529 - val_loss: 0.0210
Epoch 12/50
4/4 [==============================] - 0s 20ms/step - loss: 0.0492 - val_loss: 0.0194
Epoch 13/50
4/4 [==============================] - 0s 19ms/step - loss: 0.0465 - val_loss: 0.0198
Epoch 14/50
4/4 [==============================] - 0s 22ms/step - loss: 0.0432 - val_loss: 0.0157
Epoch 15/50
4/4 [==============================] - 0s 24ms/step - loss: 0.0398 - val_loss: 0.0139
Epoch 16/50
4/4 [==============================] - 0s 20ms/step - loss: 0.0365 - val_loss: 0.0131
Epoch 17/50
4/4 [==============================] - 0s 20ms/step - loss: 0.0318 - val_loss: 0.0129
Epoch 18/50
4/4 [==============================] - 0s 23ms/step - loss: 0.0273 - val_loss: 0.0107
Epoch 19/50
4/4 [==============================] - 0s 20ms/step - loss: 0.0269 - val_loss: 0.0097
Epoch 20/50
4/4 [==============================] - 0s 25ms/step - loss: 0.0243 - val_loss: 0.0100
Epoch 21/50
4/4 [==============================] - 0s 21ms/step - loss: 0.0227 - val_loss: 0.0098
Epoch 22/50
4/4 [==============================] - 0s 22ms/step - loss: 0.0219 - val_loss: 0.0091
Epoch 23/50
4/4 [==============================] - 0s 25ms/step - loss: 0.0205 - val_loss: 0.0089
Epoch 24/50
4/4 [==============================] - 0s 19ms/step - loss: 0.0195 - val_loss: 0.0086
Epoch 25/50
4/4 [==============================] - 0s 23ms/step - loss: 0.0197 - val_loss: 0.0081
Epoch 26/50
4/4 [==============================] - 0s 23ms/step - loss: 0.0191 - val_loss: 0.0076
Epoch 27/50
4/4 [==============================] - 0s 22ms/step - loss: 0.0195 - val_loss: 0.0074
Epoch 28/50
4/4 [==============================] - 0s 19ms/step - loss: 0.0176 - val_loss: 0.0068
Epoch 29/50
4/4 [==============================] - 0s 24ms/step - loss: 0.0177 - val_loss: 0.0067
Epoch 30/50
4/4 [==============================] - 0s 24ms/step - loss: 0.0179 - val_loss: 0.0066
Epoch 31/50
4/4 [==============================] - 0s 20ms/step - loss: 0.0169 - val_loss: 0.0066
Epoch 32/50
4/4 [==============================] - 0s 23ms/step - loss: 0.0166 - val_loss: 0.0065
Epoch 33/50
4/4 [==============================] - 0s 24ms/step - loss: 0.0159 - val_loss: 0.0061
Epoch 34/50
4/4 [==============================] - 0s 23ms/step - loss: 0.0159 - val_loss: 0.0063
Epoch 35/50
4/4 [==============================] - 0s 25ms/step - loss: 0.0152 - val_loss: 0.0062
Epoch 36/50
4/4 [==============================] - 0s 20ms/step - loss: 0.0151 - val_loss: 0.0060
Epoch 37/50
4/4 [==============================] - 0s 22ms/step - loss: 0.0148 - val_loss: 0.0064
Epoch 38/50
4/4 [==============================] - 0s 19ms/step - loss: 0.0146 - val_loss: 0.0060
Epoch 39/50
4/4 [==============================] - 0s 23ms/step - loss: 0.0143 - val_loss: 0.0059
Epoch 40/50
4/4 [==============================] - 0s 22ms/step - loss: 0.0141 - val_loss: 0.0062
Epoch 41/50
4/4 [==============================] - 0s 20ms/step - loss: 0.0138 - val_loss: 0.0059
Epoch 42/50
4/4 [==============================] - 0s 16ms/step - loss: 0.0138 - val_loss: 0.0059
Epoch 43/50
4/4 [==============================] - 0s 18ms/step - loss: 0.0136 - val_loss: 0.0059
Epoch 44/50
4/4 [==============================] - 0s 19ms/step - loss: 0.0132 - val_loss: 0.0060
Epoch 45/50
4/4 [==============================] - 0s 19ms/step - loss: 0.0129 - val_loss: 0.0057
Epoch 46/50
4/4 [==============================] - 0s 18ms/step - loss: 0.0127 - val_loss: 0.0059
Epoch 47/50
4/4 [==============================] - 0s 19ms/step - loss: 0.0125 - val_loss: 0.0056
Epoch 48/50
4/4 [==============================] - 0s 20ms/step - loss: 0.0129 - val_loss: 0.0057
Epoch 49/50
4/4 [==============================] - 0s 19ms/step - loss: 0.0121 - val_loss: 0.0059
Epoch 50/50
4/4 [==============================] - 0s 19ms/step - loss: 0.0126 - val_loss: 0.0057

Evaluation of the model

Let's plot training and validation loss to see how the training went.

In [24]:
plt.plot(history.history["loss"], label="Training Loss")
plt.plot(history.history["val_loss"], label="Validation Loss")
plt.legend()
plt.title("Training & Validation Loss Evolution\n")
plt.show()

Detecting anomalies

We will detect anomalies by determining how well our model can reconstruct the input data.

1/ Find MAE loss on training samples.

2/ Find max MAE loss value. This is the worst our model has performed trying to reconstruct a sample. We will make this the threshold for anomaly detection.

3/ If the reconstruction loss for a sample is greater than this threshold value then we can infer that the model is seeing a pattern that it isn't familiar with. We will label this sample as an anomaly.

In [25]:
X_train_pred = model.predict(X_train)
X_train_pred.shape
18/18 [==============================] - 0s 3ms/step
Out[25]:
(561, 40, 1)
In [26]:
train_mae_loss = np.mean(np.abs(X_train_pred - X_train), axis=1)
train_mae_loss.shape
Out[26]:
(561, 1)
In [27]:
# Get train MAE loss.
X_train_pred = model.predict(X_train)
train_mae_loss = np.mean(np.abs(X_train_pred - X_train), axis=1)

plt.figure(figsize = (10, 7))
sns.distplot(train_mae_loss, bins=50, kde = True)
plt.xlabel("Train MAE loss")
plt.ylabel("No of samples")
plt.show()

# Get reconstruction loss threshold.
threshold = np.max(train_mae_loss)
print("Reconstruction error threshold: ", threshold)
18/18 [==============================] - 0s 5ms/step
C:\Users\asus\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning:

`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).

Reconstruction error threshold:  0.2919210243820084

Compare recontruction

Just for fun, let's see how our model has recontructed the first sample. This is the 40 timesteps from day 1 of our training dataset.

In [28]:
# Checking how the first sequence is learnt
plt.figure(figsize = (10, 5))
plt.plot(X_train[1], label="real load curve")
plt.plot(X_train_pred[1], label="reconstructed load curve")
plt.title("Reconstruction load curve comparison\n", fontsize=15)
plt.legend()
plt.show()

Prepare test data

In [29]:
df_test_value = test_df["mains"]
fig, ax = plt.subplots()
df_test_value.plot(legend=False, ax=ax)
plt.show()
In [30]:
print("Test input shape: ", X_test.shape)
Test input shape:  (1506, 40, 1)
In [31]:
# Get test MAE loss.
X_test_pred = model.predict(X_test)
test_mae_loss = np.mean(np.abs(X_test_pred - X_test), axis=1)
test_mae_loss = test_mae_loss.reshape((-1))

plt.figure(figsize = (10, 5))
sns.distplot(test_mae_loss, bins=50)
plt.xlabel("test MAE loss")
plt.ylabel("No of samples")
plt.show()

# Detect all the samples which are anomalies.
anomalies = test_mae_loss > threshold
print("Number of anomaly samples: ")
print(np.sum(anomalies))

print("\n\nIndices of anomaly samples: ")
print(np.where(anomalies))
48/48 [==============================] - 0s 3ms/step
C:\Users\asus\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning:

`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).

Number of anomaly samples: 
104


Indices of anomaly samples: 
(array([  98,   99,  100,  103,  104,  107,  108,  159,  160,  161,  219,
        220,  221,  234,  235,  237,  239,  276,  277,  278,  279,  280,
        281,  282,  283,  284,  285,  286,  391,  392,  393,  394,  395,
        396,  397,  398,  399,  400,  401,  598,  599,  600,  601,  602,
        603,  604,  605,  606,  607,  608,  609,  610,  611,  612,  613,
        644,  645,  646,  647,  648,  649,  650,  651,  706,  707,  708,
        709,  710,  711,  712,  713,  714,  715,  716,  717,  718,  749,
        750,  751,  752,  753,  754,  755,  756,  760,  761,  762,  763,
        764,  765,  766,  767,  768,  769,  770,  771,  890,  891,  892,
        893,  894,  997,  998, 1306], dtype=int64),)

Visualize the first anomaly sequence

In [32]:
anomalies_idx = np.where(anomalies)
y_test[anomalies_idx[0][0]]
Out[32]:
array([[Timestamp('2016-04-26 04:24:00'), 0, 784],
       [Timestamp('2016-04-26 04:25:30'), 0, 785],
       [Timestamp('2016-04-26 04:27:00'), 0, 786],
       [Timestamp('2016-04-26 04:28:30'), 0, 787],
       [Timestamp('2016-04-26 04:30:00'), 0, 788],
       [Timestamp('2016-04-26 04:31:30'), 0, 789],
       [Timestamp('2016-04-26 04:33:00'), 0, 790],
       [Timestamp('2016-04-26 04:34:30'), 0, 791],
       [Timestamp('2016-04-26 04:36:00'), 0, 792],
       [Timestamp('2016-04-26 04:37:30'), 0, 793],
       [Timestamp('2016-04-26 04:39:00'), 0, 794],
       [Timestamp('2016-04-26 04:40:30'), 0, 795],
       [Timestamp('2016-04-26 04:42:00'), 0, 796],
       [Timestamp('2016-04-26 04:43:30'), 0, 797],
       [Timestamp('2016-04-26 04:45:00'), 0, 798],
       [Timestamp('2016-04-26 04:46:30'), 0, 799],
       [Timestamp('2016-04-26 04:48:00'), 0, 800],
       [Timestamp('2016-04-26 04:49:30'), 0, 801],
       [Timestamp('2016-04-26 04:51:00'), 0, 802],
       [Timestamp('2016-04-26 04:52:30'), 0, 803],
       [Timestamp('2016-04-26 04:54:00'), 0, 804],
       [Timestamp('2016-04-26 04:55:30'), 0, 805],
       [Timestamp('2016-04-26 04:57:00'), 0, 806],
       [Timestamp('2016-04-26 04:58:30'), 0, 807],
       [Timestamp('2016-04-26 05:00:00'), 0, 808],
       [Timestamp('2016-04-26 05:01:30'), 0, 809],
       [Timestamp('2016-04-26 05:03:00'), 1, 810],
       [Timestamp('2016-04-26 05:04:30'), 1, 811],
       [Timestamp('2016-04-26 05:06:00'), 1, 812],
       [Timestamp('2016-04-26 05:07:30'), 1, 813],
       [Timestamp('2016-04-26 05:09:00'), 1, 814],
       [Timestamp('2016-04-26 05:10:30'), 1, 815],
       [Timestamp('2016-04-26 05:12:00'), 1, 816],
       [Timestamp('2016-04-26 05:13:30'), 1, 817],
       [Timestamp('2016-04-26 05:15:00'), 1, 818],
       [Timestamp('2016-04-26 05:16:30'), 1, 819],
       [Timestamp('2016-04-26 05:18:00'), 1, 820],
       [Timestamp('2016-04-26 05:19:30'), 1, 821],
       [Timestamp('2016-04-26 05:21:00'), 1, 822],
       [Timestamp('2016-04-26 05:22:30'), 1, 823]], dtype=object)

Visualize the second anomaly sequence

In [33]:
y_test[anomalies_idx[0][1]]
Out[33]:
array([[Timestamp('2016-04-26 04:36:00'), 0, 792],
       [Timestamp('2016-04-26 04:37:30'), 0, 793],
       [Timestamp('2016-04-26 04:39:00'), 0, 794],
       [Timestamp('2016-04-26 04:40:30'), 0, 795],
       [Timestamp('2016-04-26 04:42:00'), 0, 796],
       [Timestamp('2016-04-26 04:43:30'), 0, 797],
       [Timestamp('2016-04-26 04:45:00'), 0, 798],
       [Timestamp('2016-04-26 04:46:30'), 0, 799],
       [Timestamp('2016-04-26 04:48:00'), 0, 800],
       [Timestamp('2016-04-26 04:49:30'), 0, 801],
       [Timestamp('2016-04-26 04:51:00'), 0, 802],
       [Timestamp('2016-04-26 04:52:30'), 0, 803],
       [Timestamp('2016-04-26 04:54:00'), 0, 804],
       [Timestamp('2016-04-26 04:55:30'), 0, 805],
       [Timestamp('2016-04-26 04:57:00'), 0, 806],
       [Timestamp('2016-04-26 04:58:30'), 0, 807],
       [Timestamp('2016-04-26 05:00:00'), 0, 808],
       [Timestamp('2016-04-26 05:01:30'), 0, 809],
       [Timestamp('2016-04-26 05:03:00'), 1, 810],
       [Timestamp('2016-04-26 05:04:30'), 1, 811],
       [Timestamp('2016-04-26 05:06:00'), 1, 812],
       [Timestamp('2016-04-26 05:07:30'), 1, 813],
       [Timestamp('2016-04-26 05:09:00'), 1, 814],
       [Timestamp('2016-04-26 05:10:30'), 1, 815],
       [Timestamp('2016-04-26 05:12:00'), 1, 816],
       [Timestamp('2016-04-26 05:13:30'), 1, 817],
       [Timestamp('2016-04-26 05:15:00'), 1, 818],
       [Timestamp('2016-04-26 05:16:30'), 1, 819],
       [Timestamp('2016-04-26 05:18:00'), 1, 820],
       [Timestamp('2016-04-26 05:19:30'), 1, 821],
       [Timestamp('2016-04-26 05:21:00'), 1, 822],
       [Timestamp('2016-04-26 05:22:30'), 1, 823],
       [Timestamp('2016-04-26 05:24:00'), 1, 824],
       [Timestamp('2016-04-26 05:25:30'), 1, 825],
       [Timestamp('2016-04-26 05:27:00'), 1, 826],
       [Timestamp('2016-04-26 05:28:30'), 1, 827],
       [Timestamp('2016-04-26 05:30:00'), 1, 828],
       [Timestamp('2016-04-26 05:31:30'), 1, 829],
       [Timestamp('2016-04-26 05:33:00'), 1, 830],
       [Timestamp('2016-04-26 05:34:30'), 1, 831]], dtype=object)
In [34]:
# get index of each sequence considered as an anomaly
sequences_anomalies_idx = list()
for i in range(len(anomalies)):
    if anomalies[i] == True:
        sequences_anomalies_idx.append(i)

# get index of each data point from X_test considered as an anomaly 
data_anomalies_idx = list()
for elm in sequences_anomalies_idx:
    for i in range(SEQUENCE_LENGTH):
        data_idx = y_test[elm][i][2] 
        data_anomalies_idx.append(data_idx)

print("Number of data points considered as anomalies (= activity) : ", len(data_anomalies_idx))
Number of data points considered as anomalies (= activity) :  4160

Plot all the sequences > threshold as an anomalie

In [35]:
df_subset = df_test_value.iloc[data_anomalies_idx]
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_test_value.index, y=df_test_value.values, name='Normal'))
fig.add_trace(go.Scatter(x=df_subset.index, y=df_subset.values, mode='markers', name='Anomaly = Activity (Predicted)'))
fig.update_layout(showlegend=True, title='Detected anomalies')
fig.show()

Plot data point > threshold of each sequences considered as an anomaly

Ici : pour chaque séquence considérée comme une anomalie, on propose de visualiser les points pour lesquels l'écart est le plus important entre le point original et le point reconstruit.

In [36]:
# a = vecteur des timestamps en anomalie
a = y_test[:, :, 0][np.abs((X_test_pred - X_test)>threshold).squeeze()]
a = np.unique(a, return_counts=True)

df_subset = df_test_value.iloc[data_anomalies_idx]
df_subset = df_subset[df_subset.index.isin(a[0])] # a[0] car a[1] contient les comptes
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_test_value.index, y=df_test_value.values, name='Normal'))
fig.add_trace(go.Scatter(x=df_subset.index, y=df_subset.values, mode='markers', name='Anomaly = Activity (Predicted)'))
fig.update_layout(showlegend=True, title='Detected anomalies')
fig.show()

A faire :

  • Prédiction de l'activité, plusieurs options : (nécessite peut-être de créer une nouvelle variable globale)
    • A partir de la séquence complète
    • A partir du compte du nombre de fois où un point est mal reconstruit (erreur calculée sur les points)
    • A partir du compte du nombre de séquences classées comme anomalies dans lequel un point apparaît
  • Risque de surestimation de l'activité, à corriger le cas échéant
  • Affiner le threshold
    • Quantile plutôt que hard maximum sur le train_set : permet d'avoir un peu plus de liberté dans le train (risque de mal se généraliser sinon)
  • Stratégie de train/test split : sur la maison 2 le test n'est pas comparable au train car moins d'énergie
    • Sampler des périodes aléatoires
  • Variables globales, hyperparamètres et modèle:
    • peut-être pas assez de données train -> reduire timestep, augmenter overlap/timeframe
    • Optimiser les HP/VG
    • S'assurer que le modèle est bien dimensionné (pour l'instant assez rapide à entraîner, ne reconstruit peut-être pas aussi bien que souhaité)
  • Plots :
    • Affichage des points en fonction du nombre de fois où ils sont en anomalie sur les séquences
    • anomalie trouvées sur le test vs nuit
    • anomalie trouvées sur le test vs activité
    • erreur de reconstruction
    • 6 exemples de courbes bien reconstruites (pas d'anomalie) et 6 exemples de courbes mal reconstruites (séquence en anomalie)

Old

Pas correct en l'état, à mettre à jour

Analyze the prediction

In [37]:
df_subset_pred = pd.DataFrame(df_subset)
df_subset_pred["activity_pred"] = 1
df_subset_pred
Out[37]:
mains activity_pred
datetime
2016-04-26 04:49:30 -0.611622 1
2016-04-26 04:51:00 -0.614715 1
2016-04-26 04:52:30 -0.151726 1
2016-04-26 04:54:00 -0.105324 1
2016-04-26 04:49:30 -0.611622 1
... ... ...
2016-05-06 06:46:30 -0.656993 1
2016-05-06 06:48:00 -0.656993 1
2016-05-06 06:52:30 -0.537378 1
2016-05-06 06:54:00 -0.631214 1
2016-05-06 06:55:30 -0.638432 1

893 rows × 2 columns

In [38]:
fig, ax = plt.subplots()

plot_activity_hist(df_subset_pred['activity_pred'], figsize=(12, 6), alpha=0.5, label='predictions', ax=ax)
plot_activity_hist(test_df["activity"], figsize=(12, 6), alpha=0.5, label='truth', color='tab:orange', ax=ax)
In [40]:
test_df_eval = test_df.copy()
test_df_eval["activity"] = 0
idx_anom = df_subset_pred.index
test_df_eval.loc[idx_anom, "activity"] = 1
test_df_eval
Out[40]:
mains activity hour
datetime
2016-04-25 08:48:00 -0.448699 0 8.800000
2016-04-25 08:49:30 -0.661117 0 8.816667
2016-04-25 08:51:00 -0.663180 0 8.850000
2016-04-25 08:52:30 -0.636370 0 8.866667
2016-04-25 08:54:00 -0.783825 0 8.900000
... ... ... ...
2016-05-07 22:52:30 -0.697208 0 22.866667
2016-05-07 22:54:00 -0.655961 0 22.900000
2016-05-07 22:55:30 -0.679678 0 22.916667
2016-05-07 22:57:00 -0.572438 0 22.950000
2016-05-07 22:58:30 -0.575531 0 22.966667

12088 rows × 3 columns

Analyze the activity prediction distribution (Activity VS Non Activity)

In [50]:
sns.histplot(data=test_df_eval, x="activity").set(title='Activity prediction distribution (Activity VS Non Activity)')
Out[50]:
[Text(0.5, 1.0, 'Activity prediction distribution (Activity VS Non Activity)')]
In [42]:
test_df_eval.activity.value_counts()
Out[42]:
0    11854
1      234
Name: activity, dtype: int64

Plot confusion matrix

In [43]:
plot_confusion_matrix(test_df["activity"], test_df_eval['activity'])
Score f_beta : 39.059%
Score accuracy : 57.892%
Out[43]:
(0.3905891243959971, 0.578921244209133)
In [44]:
gdfgfgdfdf
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-44-c24f5e1dfcf5> in <module>
----> 1 gdfgfgdfdf

NameError: name 'gdfgfgdfdf' is not defined

Post Processing

In [ ]:
%%time

def post_processing(y_test, sequence_length, data_anomalies_idx):
    """
    Post process the model prediction with different strategies
    Args:
        - y_test: 3D-array that contain sequence of timestamp, activity labels and index (Ex: [Timestamp('2016-04-25 08:48:00'), 0, 0])
        - sequence_length : The length of the y_test sequence
        - data_anomalies_idx : The list of index of each sequence predict as an anomaly by the model
    Returns:
        - DataFrame:
            - Timestamp: datetime of the time series 
            - list_idx_sequence_no_activity : index of the sequence for which no activity is predicted
            - list_idx_sequence_activity : index of the sequence for which activity is predicted
            - nb_no_activity : number of times the timestamp is in a sequence for which no activity has been predicted by the model 
            - nb_activity : number of times the timestamp is in a sequence for which activity has been predicted by the model
            - total : nb_no_activity + nb_activity
            - method_prediction_1 : Process a Majority Vote between no_activity_rate attribute and activity_rate attribute
            
        - Export DataFrame to pickle format
    """
    
    # Init dataframe with prediction
    data_prediction = pd.DataFrame(columns=['Timestamp', 
                              'list_idx_sequence_no_activity',
                              'list_idx_sequence_activity', 
                              'nb_no_activity',
                              'nb_activity', 
                              'total'])

    timestamp_list = list()
    for i in range(y_test.shape[0]):
        for k in range(sequence_length):
            timestamp_list.append(y_test[i][k][0])

    # drop duplicate
    timestamp_list = list(dict.fromkeys(timestamp_list))
    print(len(timestamp_list))

    counter = 0
    for timestamp in timestamp_list:
        counter = counter + 1
        print(str(timestamp) + ", " + str(counter))
        list_idx_sequence_no_activity = list()
        list_idx_sequence_activity = list()
        for i in range(y_test.shape[0]):
            for k in range(sequence_length):
                if timestamp == y_test[i][k][0]:
                    if i in sequences_anomalies_idx:
                        list_idx_sequence_activity.append(i)
                    else:
                        list_idx_sequence_no_activity.append(i)

        data_prediction = data_prediction.append({'Timestamp': timestamp, 
                          'list_idx_sequence_no_activity': list_idx_sequence_no_activity,
                          'list_idx_sequence_activity': list_idx_sequence_activity, 
                          'nb_no_activity': len(list_idx_sequence_no_activity),
                          'nb_activity': len(list_idx_sequence_activity), 
                          'total': len(list_idx_sequence_no_activity) + len(list_idx_sequence_activity)}
                          ,ignore_index=True)
        
    ### Majortity vote post process strategy ###
    # Process a **Majority Vote** between no_activity_rate attribute and activity_rate attribute
    data_prediction["method_prediction_1"] = np.where(data_prediction["nb_activity"] > data_prediction["nb_no_activity"], 1, 0)

        
    # Export prediction to .pickle format
    data_prediction.to_pickle("./data_prediction.pkl")
        
    return data_prediction
    
data_prediction = post_processing(y_test, SEQUENCE_LENGTH, data_anomalies_idx)
In [ ]:
data_prediction
In [ ]:
data_prediction.total.value_counts()
In [ ]: